{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from sklearn.externals import joblib\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "DIR = 'PATH/TO/YOUR/DATA'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = pd.read_csv(os.path.join(DIR,'files/unzipped_data/application_train.csv'), nrows=None)\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[Martin Kotek (Competition Host): \"Value 365243 denotes infinity in DAYS variables in the datasets, therefore you can consider them NA values. Also XNA/XAP denote NA values.\"](https://www.kaggle.com/c/home-credit-default-risk/discussion/57247)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['CODE_GENDER'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X.loc[X['DAYS_EMPLOYED'] > 0]['DAYS_EMPLOYED'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(X['ORGANIZATION_TYPE'] == 'XNA')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['CODE_GENDER'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['CODE_GENDER'].replace('XNA',np.nan, inplace=True)\n",
    "X['CODE_GENDER'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solution 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering\n",
    "## Solution 3\n",
    "### Hand crafted features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']\n",
    "X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']\n",
    "X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']\n",
    "X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']\n",
    "X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']\n",
    "X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']\n",
    "X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']\n",
    "X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']\n",
    "X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']\n",
    "X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])\n",
    "X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']\n",
    "X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']\n",
    "X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH']\n",
    "X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_EMPLOYED']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# External sources\n",
    "X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4\n",
    "for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:\n",
    "    X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(\n",
    "        X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engineered_numerical_columns = ['annuity_income_percentage',\n",
    "                                'car_to_birth_ratio',\n",
    "                                'car_to_employ_ratio',\n",
    "                                'children_ratio',\n",
    "                                'credit_to_annuity_ratio',\n",
    "                                'credit_to_goods_ratio',\n",
    "                                'credit_to_income_ratio',\n",
    "                                'days_employed_percentage',\n",
    "                                'income_credit_percentage',\n",
    "                                'income_per_child',\n",
    "                                'income_per_person',\n",
    "                                'payment_rate',\n",
    "                                'phone_to_birth_ratio',\n",
    "                                'phone_to_employ_ratio',\n",
    "                                'external_sources_weighted',\n",
    "                                'external_sources_min',\n",
    "                                'external_sources_max',\n",
    "                                'external_sources_sum',\n",
    "                                'external_sources_mean',\n",
    "                                'external_sources_nanmedian']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_eng = X[engineered_numerical_columns + ['TARGET']]\n",
    "X_eng_corr = abs(X_eng.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_eng_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(X_eng_corr, \n",
    "            xticklabels=X_eng_corr.columns,\n",
    "            yticklabels=X_eng_corr.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Aggregation features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "AGGREGATION_RECIPIES = [\n",
    "    (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('AMT_ANNUITY', 'max'),\n",
    "                                              ('AMT_CREDIT', 'max'),\n",
    "                                              ('EXT_SOURCE_1', 'mean'),\n",
    "                                              ('EXT_SOURCE_2', 'mean'),\n",
    "                                              ('OWN_CAR_AGE', 'max'),\n",
    "                                              ('OWN_CAR_AGE', 'sum')]),\n",
    "    (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),\n",
    "                                            ('AMT_INCOME_TOTAL', 'mean'),\n",
    "                                            ('DAYS_REGISTRATION', 'mean'),\n",
    "                                            ('EXT_SOURCE_1', 'mean')]),\n",
    "    (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'),\n",
    "                                                 ('CNT_CHILDREN', 'mean'),\n",
    "                                                 ('DAYS_ID_PUBLISH', 'mean')]),\n",
    "    (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'),\n",
    "                                                                                           ('EXT_SOURCE_2', 'mean')]),\n",
    "    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'),\n",
    "                                                  ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),\n",
    "                                                  ('APARTMENTS_AVG', 'mean'),\n",
    "                                                  ('BASEMENTAREA_AVG', 'mean'),\n",
    "                                                  ('EXT_SOURCE_1', 'mean'),\n",
    "                                                  ('EXT_SOURCE_2', 'mean'),\n",
    "                                                  ('EXT_SOURCE_3', 'mean'),\n",
    "                                                  ('NONLIVINGAREA_AVG', 'mean'),\n",
    "                                                  ('OWN_CAR_AGE', 'mean'),\n",
    "                                                  ('YEARS_BUILD_AVG', 'mean')]),\n",
    "    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'),\n",
    "                                                                            ('EXT_SOURCE_1', 'mean')]),\n",
    "    (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'mean'),\n",
    "                           ('CNT_CHILDREN', 'mean'),\n",
    "                           ('CNT_FAM_MEMBERS', 'mean'),\n",
    "                           ('DAYS_BIRTH', 'mean'),\n",
    "                           ('DAYS_EMPLOYED', 'mean'),\n",
    "                           ('DAYS_ID_PUBLISH', 'mean'),\n",
    "                           ('DAYS_REGISTRATION', 'mean'),\n",
    "                           ('EXT_SOURCE_1', 'mean'),\n",
    "                           ('EXT_SOURCE_2', 'mean'),\n",
    "                           ('EXT_SOURCE_3', 'mean')]),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "groupby_aggregate_names = []\n",
    "for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):\n",
    "    group_object = X.groupby(groupby_cols)\n",
    "    for select, agg in tqdm(specs):\n",
    "        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n",
    "        X = X.merge(group_object[select]\n",
    "                              .agg(agg)\n",
    "                              .reset_index()\n",
    "                              .rename(index=str,\n",
    "                                      columns={select: groupby_aggregate_name})\n",
    "                              [groupby_cols + [groupby_aggregate_name]],\n",
    "                              on=groupby_cols,\n",
    "                              how='left')\n",
    "        groupby_aggregate_names.append(groupby_aggregate_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_agg = X[groupby_aggregate_names + ['TARGET']]\n",
    "X_agg_corr = abs(X_agg.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15,8))\n",
    "sns.heatmap(X_agg_corr, \n",
    "            xticklabels=X_agg_corr.columns,\n",
    "            yticklabels=X_agg_corr.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solution 4 \n",
    "### Hand crafted features\n",
    "* diff features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "diff_feature_names = []\n",
    "for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):\n",
    "    for select, agg in tqdm(specs):\n",
    "        if agg in ['mean','median','max','min']:\n",
    "            groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n",
    "            diff_name = '{}_diff'.format(groupby_aggregate_name)\n",
    "            abs_diff_name = '{}_abs_diff'.format(groupby_aggregate_name)\n",
    "\n",
    "            X[diff_name] = X[select] - X[groupby_aggregate_name] \n",
    "            X[abs_diff_name] = np.abs(X[select] - X[groupby_aggregate_name]) \n",
    "\n",
    "            diff_feature_names.append(diff_name)\n",
    "            diff_feature_names.append(abs_diff_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_diff = X[diff_feature_names + ['TARGET']]\n",
    "X_diff_corr = abs(X_diff.corr())\n",
    "X_diff_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* unemployed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['DAYS_EMPLOYED'].replace(365243,np.nan, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_clean = X[~pd.isnull(X['DAYS_EMPLOYED'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.distplot(X_clean['DAYS_EMPLOYED'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['DAYS_EMPLOYED'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['long_employment'] = (X['DAYS_EMPLOYED'] < -2000).astype(int)\n",
    "feature_names = ['long_employment']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_employment = X[feature_names +['DAYS_EMPLOYED'] + ['TARGET']]\n",
    "X_employment_corr = abs(X_employment.corr())\n",
    "X_employment_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(X_employment_corr, \n",
    "            xticklabels=X_employment_corr.columns,\n",
    "            yticklabels=X_employment_corr.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* age binns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.distplot(X['DAYS_BIRTH'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['retirement_age'] = (X['DAYS_BIRTH'] < -14000).astype(int)\n",
    "feature_names = ['DAYS_BIRTH','retirement_age']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_age = X[feature_names + ['DAYS_BIRTH'] + ['TARGET']]\n",
    "X_age_corr = abs(X_age.corr())\n",
    "X_age_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(X_age_corr, \n",
    "            xticklabels=X_age_corr.columns,\n",
    "            yticklabels=X_age_corr.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solution 5\n",
    "## Hand Crafted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['cnt_non_child'] = X['CNT_FAM_MEMBERS'] - X['CNT_CHILDREN']\n",
    "X['child_to_non_child_ratio'] = X['CNT_CHILDREN'] / X['cnt_non_child']\n",
    "X['income_per_non_child'] = X['AMT_INCOME_TOTAL'] / X['cnt_non_child']\n",
    "X['credit_per_person'] = X['AMT_CREDIT'] / X['CNT_FAM_MEMBERS']\n",
    "X['credit_per_child'] = X['AMT_CREDIT'] / (1 + X['CNT_CHILDREN'])\n",
    "X['credit_per_non_child'] = X['AMT_CREDIT'] / X['cnt_non_child']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = ['cnt_non_child',\n",
    "                 'child_to_non_child_ratio',\n",
    "                 'income_per_non_child',\n",
    "                 'credit_per_person',\n",
    "                 'credit_per_child',\n",
    "                 'credit_per_non_child']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_family = X[feature_names + ['TARGET']]\n",
    "X_family_corr = abs(X_family.corr())\n",
    "X_family_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
